---
title: "03_preprocess_data"
author: "Jae Yeon Kim"
date: "7/2/2020"
output: html_document
---

# Import packages and files 

## Packages

```{r}

# Keyword-assisted topic modeling relies on the development version of keyATM.
# Install it from GitHub only when keyATM is not installed yet, and do so
# BEFORE the package is loaded below: this avoids re-installing (and needing
# network access) on every knit, and avoids overwriting an already-loaded
# package.
if (!requireNamespace("keyATM", quietly = TRUE)) {
  devtools::install_github("keyATM/keyATM", ref = "package_dev")
}

pacman::p_load(tidyverse, # for tidyverse 
               ggpubr, # for arranging ggplots   
               ggthemes, # for fancy ggplot themes
               here, # for reproducibility 
               patchwork, # for easy ggarrange
               ggsci, # for pubs 
               fastDummies, # to create dummy variables fast
               readtext, # for reading text
               quanteda, # for text preprocessing 
               data.table, # for fast data manipulation
               stm, # for structural topic modeling
               future, # for parallel and distributed computing      
               purrr, # for functional programming 
               keyATM, # keyATM
               latex2exp)

# for publication-friendly theme 
theme_set(theme_pubr())

# custom functions
source(here("functions", "stacked_bar_plot.R"))
source(here("functions", "visualize_diag.R"))
source(here("functions", "date2index.R"))

# Attach keyATM explicitly so that a failed load stops the notebook here
# with an error instead of failing later in the modeling chunks.
library(keyATM)
```

## Files 

- Filter out about 90% of the data, keeping only the tweets flagged by the `chinese`, `wuhan`, or `asian` variables

```{r}

# Read the cleaned data and drop its first column
df <- data.table::fread(here("processed_data", "cleaned_data.csv"))
df <- df[, -1]

# Keep only the rows flagged by at least one of the three indicator variables
small_df <- filter(df, chinese == 1 | wuhan == 1 | asian == 1)

# Share of the original rows removed by the subsetting step
1 - nrow(small_df) / nrow(df)

# Row positions (within small_df) of the documents whose length is 0
zero_length_rows <- c(
  2930, 25557, 33569, 49217, 49784, 51572, 71270, 89453, 90285, 102360,
  108111, 110351, 113704, 116119, 120643, 121220, 122300, 122695, 125647,
  133053
)

small_df <- small_df[-zero_length_rows, ]

# Intervention indicator: 1 for dates on or after 2020-03-16, 0 before
is_post_intervention <- small_df$date >= "2020-03-16"
small_df$intervention <- ifelse(is_post_intervention, 1, 0)
```

# Preprocess 

```{r}

# Turn the tweet texts into a quanteda corpus
my_corpus <- corpus(small_df$full_text)

# Attach document-level covariates ----

# Indicator variables, stored as they are
docvars(my_corpus, "wuhan") <- small_df$wuhan
docvars(my_corpus, "asian") <- small_df$asian
docvars(my_corpus, "chinese") <- small_df$chinese

# Variables stored as factors
docvars(my_corpus, "trump") <- as.factor(small_df$trump)
docvars(my_corpus, "intervention") <- as.factor(small_df$intervention)

# Date of each document and the month extracted from it
docvars(my_corpus, "date") <- small_df$date
docvars(my_corpus, "month") <- lubridate::month(small_df$date)

# Replace the docvars with the output of the custom date2index() helper
# (sourced from functions/date2index.R)
docvars(my_corpus) <- date2index(my_corpus)

# Cache the corpus on disk and read it back
write_rds(my_corpus, here("outputs", "my_corpus.rds"))

my_corpus <- read_rds(here("outputs", "my_corpus.rds"))

```

```{r}
# Words to drop: English stopwords followed by a few extra
# function words and COVID-related terms
words_to_remove <- c(
  stopwords("english"),
  "may", "shall", "can",
  "must", "upon", "with", "without",
  "covid", "covid19", "covid-19"
)

# Tokenize the corpus (URLs removed), then strip the unwanted words
data_tokens <- tokens(my_corpus, remove_url = TRUE)
data_tokens <- tokens_remove(data_tokens, words_to_remove)

```

# Document-term matrix 

```{r}
# Build the document-feature matrix, then trim it with thresholds of 100
# on both term frequency and document frequency
data_dfm <- dfm(data_tokens)

data_dfm <- dfm_trim(data_dfm,
                     min_termfreq = 100,
                     min_docfreq = 100)

```


```{r}
# Cache the trimmed document-feature matrix on disk and read it back
dfm_path <- here("processed_data", "data_dfm.rds")

write_rds(data_dfm, dfm_path)

data_dfm <- read_rds(dfm_path)
```

# KeyATM

## Prepare the data 

```{r}
# Prepare the data for keyATM

# The "multiprocess" strategy is deprecated in the future package and has been
# removed in recent releases, so request "multisession" (background R
# sessions) instead; it is the replacement recommended by the package.
future::plan("multisession")

# Convert the document-feature matrix into keyATM's input format and time it
tictoc::tic()
keyATM_docs <- keyATM_read(texts = data_dfm)
tictoc::toc()

# 243.578 sec elapsed

# Export 
write_rds(keyATM_docs, here("processed_data", 
                            "keyATM_docs.rds"))

# Read the cached object back
keyATM_docs <- read_rds(here("processed_data", "keyATM_docs.rds"))

```

## Create a dictionary of the key words 

```{r}

# Keywords that define the "Anti-Asian" topic
anti_asian_keywords <- c(
  "wuhanvirus", "chinesevirus", "chinavirus", "wuhancoronavirus",
  "wuhanpneumonia", "ccpvirus", "chinaliedpeopledied"
)

# Keywords that define the "Anti-racism" topic
anti_racism_keywords <- c(
  "antiracism", "stophate", "acttochange", "stopaapihate",
  "stophatecrimes", "antiasian", "racism", "racist"
)

# Named list of keyword sets; the names are used as the topic labels
keywords <- list(
  "Anti-Asian" = anti_asian_keywords,
  "Anti-racism" = anti_racism_keywords
)

```


## Check keywords 

```{r}
# Where the keyword figure is written
keyword_fig_path <- here("outputs", "keyword.png")

# Visualize the selected keywords in the documents
key_viz <- visualize_keywords(docs = keyATM_docs, keywords = keywords)

# Save the figure to disk
save_fig(key_viz, keyword_fig_path)

# Values underlying the figure
vf <- values_fig(key_viz)

# Display the figure in the rendered document
key_viz
```

## Number of K

```{r}

# future::plan(multiprocess)

# Fit one structural topic model for each candidate number of topics (3 to 5)
many_models <- tibble(K = 3:5)

many_models <- mutate(
  many_models,
  topic_model = furrr::future_map(
    K,
    function(k) stm(data_dfm, K = k, verbose = TRUE)
  )
)

# Cache the fitted models on disk and read them back
many_models_path <- here("outputs", "many_models.rds")

write_rds(many_models, many_models_path)

many_models <- read_rds(many_models_path)

```

```{r}

# Resolve conflicts 

# conflict_prefer() takes the function name first and the winning package
# second, so this declares that `map` should resolve to purrr::map.
conflicted::conflict_prefer("map", "purrr")

# Diagnostics for the K search, produced by the custom visualize_diag()
# helper (sourced from functions/visualize_diag.R)
k_search_diag <- visualize_diag(data_dfm, many_models)

# Save the most recent ggplot to disk
ggsave(here("outputs", "k_search_diag.png"))

```

## Static topic modeling 

```{r}

# The "multiprocess" strategy is deprecated in the future package and has been
# removed in recent releases, so request "multisession" instead; it is the
# replacement recommended by the package.
future::plan("multisession")

# Fit the base keyATM model: the keyword topics plus one topic without keywords
out <- keyATM(docs = keyATM_docs,       # text input
              no_keyword_topics = 1,    # number of topics without keywords
              keywords = keywords,      # keywords
              model = "base",           # select the model
              options = list(seed = 250,
              store_theta = TRUE))

# Cache the fitted model on disk and read it back
write_rds(out, here("outputs", "keyATM_out.rds"))

out <- read_rds(here("outputs", "keyATM_out.rds"))

# theta = document-topic distribution 
# Rounding to 0 digits replaces every entry with the nearest integer.
# Note that this overwrites theta in the in-memory `out` object; the copy
# saved on disk above is unaffected.
out$theta <- round(out$theta, 0)

# sum 
# Column totals of the rounded theta for the first three topics
sums <- c(sum(out$theta[,1]), sum(out$theta[,2]), sum(out$theta[,3]))

```

```{r}
# Labels for the three topic totals, in the same order as `sums`
topic_names <- c("Anti-Asian", "Anti-racism", "Others")

# One row per topic: its total and its share of all totals (2 decimals)
topic_out <- tibble(topic_sums = sums, names = topic_names)

topic_out <- mutate(topic_out,
                    prop = round(topic_sums / sum(topic_sums), 2))

# Bar chart of the topic shares, with the y axis shown in percent
ggplot(topic_out, aes(x = names, y = prop)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    x = "Topic name",
    y = "Topic proportion",
    title = "Topic-document distributions",
    subtitle = "Tweets mentioned COVID-19 and either Asian, Chinese, or Wuhan related words"
  )

# Save the most recent plot to disk
ggsave(here("outputs", "topic_modeling_static.png"))
    
```

## Covariate topic modeling 

```{r}

# Pull the document-level variables out of the corpus
vars <- docvars(my_corpus)

# Keep only the intervention indicator and recode it into readable labels
vars_selected <- select(vars, intervention)

vars_selected <- mutate(
  vars_selected,
  intervention = ifelse(intervention == 1,
                        "Post-Trump speech",
                        "Pre-Trump speech")
)

# Settings for the covariate keyATM model
covariate_settings <- list(covariates_data = vars_selected,
                           covariates_formula = ~ intervention)

covariate_options <- list(seed = 250, store_theta = TRUE)

# Fit the covariate keyATM model: keyword topics plus one topic without keywords
covariate_out <- keyATM(
  docs = keyATM_docs,
  no_keyword_topics = 1,
  keywords = keywords,
  model = "covariate",
  model_settings = covariate_settings,
  options = covariate_options
)

```

```{r}
# Predicted mean of the document-topic distribution by intervention status.
# by_strata_DocTopic() is applied to the covariate model, so `covariate_out`
# is passed here rather than the base-model object `out`.
strata_topic <- by_strata_DocTopic(covariate_out, by_var = "intervention",
                                   labels = c("Post-Trump speech", "Pre-Trump speech"))

est <- summary(strata_topic)

# Baseline 
# Start from the covariates stored in the fitted model and set the
# intervention column to 0 for every document.
new_data <- covariates_get(covariate_out)

# NOTE(review): this assumes the stored covariates have a column named
# exactly "intervention"; if the covariate was expanded into dummy columns
# the name will differ -- confirm with colnames(new_data).
new_data[, "intervention"] <- 0 

pred <- predict(covariate_out, new_data, label = "Others")

# Bind them together 
res <- bind_rows(est, pred)

# Distinct labels in order of appearance; reversed below to order the axis
labels <- unique(res$label)

# Point estimates with interval bars, one panel per topic
ggplot(res, aes(x = label, ymin = Lower, ymax = Upper, group = Topic)) +
  geom_errorbar(width = 0.1) +
  coord_flip() +
  facet_wrap(~Topic) +
  geom_point(aes(x = label, y = Point)) +
  scale_x_discrete(limits = rev(labels)) +
  xlab("Intervention") +
  scale_y_continuous(labels =    
  scales::percent_format(accuracy = 1)) 
```

## Dynamic topic modeling 

```{r}

# Settings for the dynamic model: the time index stored in the corpus
# docvars and the number of states
dynamic_settings <- list(time_index = docvars(my_corpus)$index,
                         num_states = 5)

dynamic_options <- list(seed = 250, store_theta = TRUE, thinning = 5)

# Fit the dynamic keyATM model and report the elapsed time
tictoc::tic()
dynamic_out_day <- keyATM(
  docs = keyATM_docs,
  no_keyword_topics = 1,
  keywords = keywords,
  model = "dynamic",
  model_settings = dynamic_settings,
  options = dynamic_options
)
tictoc::toc()

# Cache the fitted model on disk
write_rds(dynamic_out_day, here("outputs", "dynamic_out_day.rds"))

```

```{r}
# Load the cached dynamic model
dynamic_out_day <- read_rds(here("outputs", "dynamic_out_day.rds"))

# Time-trend figure, labelling the time index with the document dates
doc_dates <- as.Date(docvars(my_corpus)$date)

fig_timetrend_day <- plot_timetrend(dynamic_out_day,
                                    time_index_label = doc_dates,
                                    xlab = "Date",
                                    width = 5)

keyATM::save_fig(fig_timetrend_day, here("outputs", "dynamic_topic_day.png"))

# Pull the values behind the figure into a plain data frame for a custom plot
trend_values <- fig_timetrend_day$values

df <- data.frame(date = trend_values$time_index,
                 mean = trend_values$Point,
                 upper = trend_values$Upper,
                 lower = trend_values$Lower,
                 topic = trend_values$Topic)

```

```{r}
# Topic trends over time, one panel per topic. The x/y mapping is declared
# once; the ribbon and the smoother add the interval bounds on top of it.
ggplot(df, aes(x = date, y = mean)) +
  geom_line(alpha = 0.5, size = 1.2) +
  geom_ribbon(aes(ymax = upper, ymin = lower), alpha = 0.3) +
  # For a given x, loess uses the 0.3 * N closest points to x to fit.
  # Source: https://rafalab.github.io/dsbook/smoothing.html
  geom_smooth(aes(ymax = upper, ymin = lower),
              method = "loess",
              size = 1.5,
              span = 0.3,
              color = "black") +
  labs(title = "Topic trends over time",
       subtitle = "Tweets mentioned COVID-19 and either Asian, Chinese, or Wuhan",
       x = "Date",
       y = "Topic proportion") +
  facet_wrap(~topic) +
  # Dashed vertical line on 2020-03-16
  geom_vline(xintercept = as.Date(c("2020-03-16")),
             linetype = "dashed",
             size = 1.2,
             color = "black") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1))

# Save the most recent plot to disk
ggsave(here("outputs", "anti_asian_topic_dynamic_trend.png"))

```